library loading

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

loading data

# Poll results: read pizza_jared.csv from the TidyTuesday 2019-10-01 folder.
pizza_jared <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_jared.csv"
)
## Parsed with column specification:
## cols(
##   polla_qid = col_double(),
##   answer = col_character(),
##   votes = col_double(),
##   pollq_id = col_double(),
##   question = col_character(),
##   place = col_character(),
##   time = col_double(),
##   total_votes = col_double(),
##   percent = col_double()
## )
# Barstool reviews: read pizza_barstool.csv from the TidyTuesday 2019-10-01 folder.
pizza_barstool <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_barstool.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   name = col_character(),
##   address1 = col_character(),
##   city = col_character(),
##   country = col_character()
## )
## See spec(...) for full column specifications.
# Datafiniti listings: read pizza_datafiniti.csv from the TidyTuesday 2019-10-01 folder.
pizza_datafiniti <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_datafiniti.csv"
)
## Parsed with column specification:
## cols(
##   name = col_character(),
##   address = col_character(),
##   city = col_character(),
##   country = col_character(),
##   province = col_character(),
##   latitude = col_double(),
##   longitude = col_double(),
##   categories = col_character(),
##   price_range_min = col_double(),
##   price_range_max = col_double()
## )

DATA PREVIEWING

lets view the data

First, I want to see which cities have a lot of reviews.

# Number of rows in pizza_barstool for each city.
pizza_barstool %>%
  count(city)
## # A tibble: 99 x 2
##    city           n
##    <chr>      <int>
##  1 Alpharetta     1
##  2 Ann Arbor      4
##  3 Atlanta        6
##  4 Augusta        1
##  5 Austin         1
##  6 Austintown     1
##  7 Blacksburg     1
##  8 Boston        13
##  9 Braintree      1
## 10 Brockton       1
## # … with 89 more rows

This gives us ten pages of data. Some cities have only one review, so let's eliminate those.

# Number of rows per place in pizza_jared, most frequent first.
count(pizza_jared, place, sort = TRUE)
## # A tibble: 56 x 2
##    place                   n
##    <chr>               <int>
##  1 Fiore's                25
##  2 Prince Street Pizza    20
##  3 NY Pizza Suprema       15
##  4 Joe's 14th             10
##  5 Joe's Pizza 14th       10
##  6 Little Italy Pizza     10
##  7 Pizza Mercato          10
##  8 Pizza Paradise         10
##  9 Pomodoro               10
## 10 Rocco's Pizza Joint    10
## # … with 46 more rows

looks like we have 56 pizza places

# Number of rows for each distinct answer value in pizza_jared.
count(pizza_jared, answer)
## # A tibble: 6 x 2
##   answer          n
##   <chr>       <int>
## 1 Average        75
## 2 Excellent      75
## 3 Fair            1
## 4 Good           75
## 5 Never Again    74
## 6 Poor           75

These are the different kinds of answers we could get.

CLEANING: From our earlier view, we found that these cities have more than two reviews: Ann Arbor, Atlanta, Boston, Bronx, Brooklyn, Chicago, Columbus, Indianapolis, Las Vegas, Lexington, Louisville, Minneapolis, Morgantown, Nantucket, New Haven, New York, Saratoga Springs, Staten Island, Youngstown

# Cities kept for the city-level comparison, hand-picked from the
# count(pizza_barstool, city) preview. Defined once so that both subsets
# below filter on exactly the same set.
# NOTE(review): "Louisville" was previously spelled "Lousiville", a value
# that %in% would silently never match; confirm the spelling used in the data.
cities_of_interest <- c(
  "Ann Arbor", "Atlanta", "Boston", "Bronx", "Brooklyn", "Chicago",
  "Columbus", "Indianapolis", "Las Vegas", "Lexington", "Louisville",
  "Minneapolis", "Morgantown", "Nantucket", "New Haven", "New York",
  "Saratoga Springs", "Staten Island", "Youngstown"
)

# Barstool rows for the selected cities. The result stays grouped by city,
# price level and community score, as before; no summary is computed here.
by_city <- pizza_barstool %>%
  filter(city %in% cities_of_interest) %>%
  group_by(city, price_level, review_stats_community_average_score)

# Same rows, reduced to the city plus the community and critic average scores.
By_city_2 <- pizza_barstool %>%
  filter(city %in% cities_of_interest) %>%
  select(city, review_stats_community_average_score, review_stats_critic_average_score)

After glancing at the data, and reviewing what others did on Twitter, this is what I have decided: 1. I want to mutate the date and time - use the function as.POSIXct to convert the time column so it represents calendar dates and times (Note - this may come in handy for tidying my dataset for the project) 2. I want to group by place so that question and answer are associated with each place 3. Summarize the votes 4. Then get the total number of votes 5. Ungroup so I can compute the percent of votes

# Rating scale from worst (position 1) to best (position 5). An answer's
# position in this vector is its numeric score.
answer_scale <- c("Never Again", "Poor", "Average", "Good", "Excellent")

# Vote summary per place / question / answer:
#   votes           - votes summed over all rows of that answer
#   total           - all votes for the place/question
#   percent         - share of the place/question votes this answer received
#   answer_interger - numeric score of the answer (NA for answers that are
#                     not on answer_scale, e.g. "Fair")
#   average         - percent-weighted mean score for the place/question
by_place <- pizza_jared %>%
  # time and date are converted here but are not carried through summarize()
  mutate(time = as.POSIXct(time, origin = "1970-01-01"),
         date = as.Date(time)) %>%
  group_by(place, question, answer) %>%
  summarize(votes = sum(votes)) %>%
  # summarize() drops the last grouping level, so the data is still grouped
  # by place and question here: totals and averages are computed per poll
  # rather than across the whole table.
  mutate(total = sum(votes),
         percent = votes / total,
         # answer is a character column, so as.integer(answer) only yields
         # NA; look up the answer's position on the scale instead.
         answer_interger = match(answer, answer_scale),
         # na.rm skips off-scale answers and places with zero total votes
         average = sum(answer_interger * percent, na.rm = TRUE)) %>%
  ungroup()
## Warning: NAs introduced by coercion

FIGURES

Which city has the best pizza based on the community average score for the price level?

# Mean community score per city, one panel per price level.
# geom_col() stacks every row it receives, so the scores are averaged per
# city and price level first; otherwise each bar would show the sum of all
# restaurant scores in that city.
by_city %>%
  group_by(city, price_level) %>%
  summarize(avg_community_score = mean(review_stats_community_average_score,
                                       na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(aes(city, avg_community_score)) +
  geom_col() +
  # rotated labels: right-align against the axis and centre on the tick
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  facet_wrap(~ price_level) +
  # x is the city and y the rating (the labels were previously swapped)
  labs(x = "City",
       y = "Average Rating from Community",
       title = "Tidy Tuesday: Pizza ratings from the Community Broken Down by City and Price")

Let's make another bar graph. This graph shows, for each pizza place, any quality rating that received over 50% of the votes. The purpose of this graph is to help someone narrow down which place they may want to go to.

# Places where a single answer received at least half of the votes,
# one panel per answer.
by_place %>%
  filter(percent >= 0.50) %>%
  ggplot(aes(place, percent)) +
  geom_col() +
  # rotated labels: right-align against the axis and centre on the tick
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ answer) +
  # x is the pizza place and y the vote share (the labels were previously
  # swapped and did not describe the plotted values)
  labs(x = "Pizza Place",
       y = "% of votes",
       title = "Pizza Place Rankings with over 50% of Voters")

We want a minimum number of votes for a place to be included across the “meet up”… let's use 20.

# Answers in rating order, worst to best.
answer_orders <- c("Never Again", "Poor", "Average", "Good", "Excellent")

# Vote share per answer for every place with at least 20 votes in total.
by_place %>%
  # Put the answers in rating order so the x-axis reads worst -> best rather
  # than alphabetically. Answers not listed in answer_orders keep their place
  # after the listed ones. Done before filtering so every listed level is
  # still present in the data.
  mutate(answer = fct_relevel(answer, answer_orders)) %>%
  filter(total >= 20) %>%
  ggplot(aes(answer, percent)) +
  geom_col() +
  facet_wrap(~ place) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Ranking",
       y = "% of respondents",
       title = "Tidy Tuesday: Most Popular Pizza Places with 20 respondents")

# Number of pizza places per city among those with at least 300 provider
# reviews. geom_bar() counts rows (one per pizza place), not reviews, so the
# axis label and title describe places rather than review totals.
pizza_barstool %>%
  filter(provider_review_count >= 300) %>%
  ggplot(aes(city)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "City",
       y = "Number of Pizza Places",
       title = "Pizza places with at least 300 provider reviews, by city")